{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python_defaultSpec_1600504111152",
   "display_name": "Python 3.7.4 64-bit ('base': conda)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# MNIST"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.datasets import fetch_mldata\n",
    "# from sklearn.datasets import fetch_openml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": "D:\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function fetch_mldata is deprecated; fetch_mldata was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n  warnings.warn(msg, category=DeprecationWarning)\nD:\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function mldata_filename is deprecated; mldata_filename was deprecated in version 0.20 and will be removed in version 0.22. Please use fetch_openml.\n  warnings.warn(msg, category=DeprecationWarning)\n"
    }
   ],
   "source": [
    "mnist = fetch_mldata(\"MNIST original\")\n",
    "# mnist = fetch_openml(\"mnist_784\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "{'DESCR': 'mldata.org dataset: mnist-original',\n 'COL_NAMES': ['label', 'data'],\n 'target': array([0., 0., 0., ..., 9., 9., 9.]),\n 'data': array([[0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        ...,\n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0],\n        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}"
     },
     "metadata": {},
     "execution_count": 15
    }
   ],
   "source": [
    "mnist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y =mnist['data'], mnist['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(70000, 784)"
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = np.array(X[:60000], dtype=float)\n",
    "y_train = np.array(y[:60000], dtype=float)\n",
    "X_test = np.array(X[60000:], dtype=float)\n",
    "y_test = np.array(X[60000:], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(10000, 784)"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "source": [
    "## 使用knn"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "knn_clf = KNeighborsClassifier()\n",
    "%time knn_clf.fit(X_train, y_train) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NotFittedError",
     "evalue": "This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNotFittedError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<timed eval>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36mscore\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m    355\u001b[0m         \"\"\"\n\u001b[0;32m    356\u001b[0m         \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 357\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    358\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    359\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\neighbors\\classification.py\u001b[0m in \u001b[0;36mpredict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    147\u001b[0m         \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'csr'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    148\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 149\u001b[1;33m         \u001b[0mneigh_dist\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    150\u001b[0m         \u001b[0mclasses_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclasses_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    151\u001b[0m         \u001b[0m_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\neighbors\\base.py\u001b[0m in \u001b[0;36mkneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m    382\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    383\u001b[0m         \"\"\"\n\u001b[1;32m--> 384\u001b[1;33m         \u001b[0mcheck_is_fitted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"_fit_method\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"_fit_X\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mall_or_any\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0many\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    385\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    386\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mn_neighbors\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36mcheck_is_fitted\u001b[1;34m(estimator, attributes, msg, all_or_any)\u001b[0m\n\u001b[0;32m    912\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    913\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mall_or_any\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mattr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mattr\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mattributes\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 914\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mNotFittedError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'name'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    915\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    916\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNotFittedError\u001b[0m: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method."
     ]
    }
   ],
   "source": [
    "#计算约15min,0.985 %time knn_clf.score(X_test, y_test)"
   ]
  },
  {
   "source": [
    "# PCA 进行降维"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "pca = PCA(0.9)\n",
    "pca.fit(X_train)\n",
    "X_train_reduction = pca.transform(X_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(60000, 87)"
     },
     "metadata": {},
     "execution_count": 23
    }
   ],
   "source": [
    "X_train_reduction.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Wall time: 457 ms\n"
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n                     weights='uniform')"
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "source": [
    "knn_clf = KNeighborsClassifier()\n",
    "%time knn_clf.fit(X_train_reduction, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test_reduction  = pca.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "ValueError",
     "evalue": "Classification metrics can't handle a mix of multiclass-multioutput and multiclass targets",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<timed eval>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36mscore\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m    355\u001b[0m         \"\"\"\n\u001b[0;32m    356\u001b[0m         \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 357\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    358\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    359\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36maccuracy_score\u001b[1;34m(y_true, y_pred, normalize, sample_weight)\u001b[0m\n\u001b[0;32m    174\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    175\u001b[0m     \u001b[1;31m# Compute accuracy for each possible representation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 176\u001b[1;33m     \u001b[0my_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_true\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_check_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    177\u001b[0m     \u001b[0mcheck_consistent_length\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    178\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0my_type\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'multilabel'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m     79\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_type\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     80\u001b[0m         raise ValueError(\"Classification metrics can't handle a mix of {0} \"\n\u001b[1;32m---> 81\u001b[1;33m                          \"and {1} targets\".format(type_true, type_pred))\n\u001b[0m\u001b[0;32m     82\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     83\u001b[0m     \u001b[1;31m# We can't have more than one value on y_type => The set is no more needed\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: Classification metrics can't handle a mix of multiclass-multioutput and multiclass targets"
     ]
    }
   ],
   "source": [
    "%time knn_clf.score(X_test_reduction, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}